Background

The following is a preliminary data analysis of nitrate spikes in water after rainfall events at different locations in the Midwest.

Nitrate data comes from the US Geological Survey.

Precipitation data comes from NOAA’s Local Climatological Data.

Since there aren’t USGS monitoring locations in every “hot-spot” county from our previous analysis, we focused on finding locations that met the following criteria:

  1. Monitors the parameter “99133,” i.e., nitrates in milligrams per liter.
  2. Location is still active today
  3. Had at least one nitrate spike above the federal threshold of 10mg/L this year
  4. Its data goes as far back as possible

Part 1

First, we loaded the R libraries we will use, including dataRetrieval. This is a package that allows us to request USGS data from R Studio.

library(dataRetrieval)
library(tidyverse)
library(lubridate)
library(rvest)
library(leaflet)
library(leaflet.providers)
library(dygraphs)
library(sp)
library(rgeos)
library(xts)
library(htmlwidgets)
library(data.table)
library(knitr)
library(DT)

Then, we ran the following code for each Midwest state. It uses a dataRetrieval function to request all the USGS monitoring locations in the given state and then filters them, keeping the ones that are still active and that have nitrate data. This gives us the site number, coordinates and how long each location has been active.

Note: North Dakota is not included below since it does not have any nitrate monitoring location.

# Request every USGS monitoring location in a state, then keep the active
# stream sites that record nitrates (USGS parameter code 99133).
#
# state_cd:    two-letter state code, e.g. "IL".
# active_date: a site is treated as "still active" when its series
#              end_date equals this pull date. NOTE(review): re-running
#              this analysis later requires updating the default.
#
# Returns one row per site: site_no, station_nm, lat, long, begin_date.
get_nitrate_sites <- function(state_cd, active_date = "2021-08-24") {
  readNWISdata(stateCd = state_cd, parameterCd = "99133",
               service = "site", seriesCatalogOutput = TRUE) %>%
    filter(site_tp_cd == "ST") %>%          # stream sites only
    filter(end_date == active_date) %>%     # still reporting as of the pull date
    filter(parm_cd == "99133") %>%          # nitrate series only
    distinct(site_no, .keep_all = TRUE) %>% # one row per site
    select(site_no, station_nm, lat = dec_lat_va,
           long = dec_long_va, begin_date)
}

IL_site <- get_nitrate_sites("IL") # 12 obs of 5 var
IN_site <- get_nitrate_sites("IN") # 10 obs of 5 var
IA_site <- get_nitrate_sites("IA") # 11 obs of 5 var
KS_site <- get_nitrate_sites("KS") # 4 obs of 5 var
MI_site <- get_nitrate_sites("MI") # 3 obs of 5 var
MN_site <- get_nitrate_sites("MN") # 0 obs of 5 var
MO_site <- get_nitrate_sites("MO") # 3 obs of 5 var
NE_site <- get_nitrate_sites("NE") # 0 obs of 5 var
OH_site <- get_nitrate_sites("OH") # 2 obs of 5 var
SD_site <- get_nitrate_sites("SD") # 3 obs of 5 var
WI_site <- get_nitrate_sites("WI") # 0 obs of 5 var

After running the above code, we noticed that, in addition to ND, the states MN, NE and WI do not have monitoring locations that are still active. Then we made a function that does the following:

  1. Pulls all the site numbers from a given state and requests nitrate data from this year.

  2. It aggregates the data to find the maximum nitrate reading, yearlyPeak.

  3. It creates a new column that detects if the maximum nitrate level in each location was above 10 mg/L or not.

  4. It removes sites that did not have a nitrate peak higher than the federal threshold this year.

  5. It arranges the remaining locations from oldest to newest and selects the top one.

This means the function will choose one location per state that has data going as far back as possible and that had a nitrate spike above federal guidelines.

# Pick the single "best" monitoring site in a state: the site with the
# oldest record that also peaked above the 10 mg/L federal nitrate
# threshold this year.
#
# state:      a site table from get_nitrate_sites() (site_no, station_nm,
#             lat, long, begin_date).
# start_date, end_date: window of instantaneous ("iv") nitrate readings
#             to scan; defaults preserve the original analysis window.
#
# Returns a one-row data frame (or zero rows if no site qualified).
state_function <- function(state, start_date = "2021-01-01",
                           end_date = "2021-08-18"){

  # 15-minute nitrate readings for every candidate site in the window
  iv <- readNWISdata(siteNumbers = state$site_no, parameterCd = "99133",
                     startDate = start_date, endDate = end_date,
                     service = "iv")

  iv$year <- year(iv$dateTime)

  # Maximum reading per site for the year. Readings above 40 mg/L are
  # dropped as implausible sensor spikes — TODO(review): confirm this
  # threshold; the original author flagged it as a "wonky filter".
  yearPeak <- iv %>%
    group_by(site_no, year) %>%
    summarise(yearlyPeak = max(X_99133_00000, na.rm = TRUE),
              .groups = "drop") %>%
    filter(yearlyPeak <= 40) %>%
    select(-year)

  # Join peaks back onto the site table, keep only sites whose peak broke
  # the 10 mg/L federal threshold, and take the one with the oldest data.
  state %>%
    right_join(yearPeak, by = "site_no") %>%
    arrange(begin_date) %>%
    mutate(illegal = case_when(
      yearlyPeak >= 10 ~ "Yes",
      yearlyPeak < 10 ~ "No")) %>%
    filter(illegal == "Yes") %>%
    select(-illegal) %>%
    head(1) # this is our top location in this state
}

Now we can run the function for each Midwest state that has an active nitrate monitoring location:

# Select the top site in each state that still has at least one active
# nitrate monitoring location (ND, MN, NE and WI had none).
# Each result is either a one-row data frame or empty.
IL <- state_function(IL_site)
IN <- state_function(IN_site)
IA <- state_function(IA_site)
KS <- state_function(KS_site)
MI <- state_function(MI_site)
MO <- state_function(MO_site)
OH <- state_function(OH_site)
SD <- state_function(SD_site)

IA, IL, IN, MI, and MO have at least one monitoring location based on the criteria we used to filter the places. However, MI and MO only go back one year.

Hence, we decided to use IA, IL and IN as examples in this analysis.

# combine the 3 left 
usgs <- rbind(IA, IL, IN) # most recent date is 2015-03-20 from IN

# keep only the columns the mapping step needs, and tag the source agency
usgs_simple <- usgs %>%
  select(name = station_nm, lat, long, begin_date) %>%
  mutate(type = "USGS")

# clear the per-state intermediates we no longer need
rm(IA_site, IL_site, IN_site, KS_site, MI_site, MN_site,
   MO_site, NE_site, OH_site, SD_site, WI_site,
   KS, MI, MO, SD, IA, IL, IN, OH)

Part 2

NOAA has several precipitation data sets. For this analysis we used their Local Climatological Data.

In order to find the closest weather station to the three USGS locations we chose, we scraped the weather station’s coordinates (because they were not included in the data we downloaded.)

The first part was scraped through a Chrome plugin to get the links for each weather station in Iowa, Illinois and Indiana — the three states where we have our USGS locations. There are a total of 154 weather stations in those states.

After we compiled those links, we imported them into R to do the rest of the scraping from here.

# import csv I manually scraped
# One row per weather station page (154 stations across IA, IL and IN),
# compiled externally with a Chrome scraping plugin.
airport_links <- read_csv("Data/NOAA/airport_links_clean.csv")

# now make a list with all the links
# link_function() below indexes into this list by position.
airport_list <- as.list(airport_links$link)

Then we created another function that scrapes two tables for each station. One contains its coordinates and the second contains data about the start and end date of its data collection.

# Scrape the two metadata tables from one weather station's page and
# stack them into a single data frame.
#
# index: the station's position in `airport_list`.
# Returns the two tables row-bound together; assumes both tables have
# the same number of columns (the second table's headers are replaced
# with the first's so the rows line up).
link_function <- function(index){

  page <- read_html(airport_list[[index]])

  tables <- html_table(page)

  top <- tables[[1]]
  bottom <- tables[[2]]
  names(bottom) <- names(top)

  rbind(top, bottom)
}

Now we can run the function for each link using its index number on the list we imported.

After those tables are compiled into a single data frame, `all`, we transposed it and cleaned up its format.

# transpose 
# Scraped tables arrive with fields as rows; flip so fields are columns.
# NOTE(review): the name `all` shadows base::all() — consider renaming.
all <- as.data.frame(t(all))

# fix header and row names 
# After transposing, the first row holds the field names.
colnames(all) <- all[1, ]

# Drop the repeated header rows (one per scraped station).
all <- all %>% filter(Name != "Name") 

rownames(all) <- NULL

# Keep only the fields used downstream, with friendlier names.
all <- all %>% select(name = Name,
                      lat_long = `Latitude/Longitude`,
                      begin_date = `Start Date¹`,
                      end_date = `End Date¹`)

# clean lat_long columns 
# Split "lat°,long°" into two columns, then strip the degree symbols
# so they can later be coerced to numeric.

all <- all %>% separate(lat_long, c("lat", "long"), ",")

all$lat <- gsub("°", "", all$lat)
all$long <- gsub("°", "", all$long)

Then we removed the stations that are no longer active and this is what we have left:

# then remove by end date, not active now 
# NOTE(review): "active" is approximated as end_date equal to the day the
# data was scraped (2021-08-23); re-running later needs an updated date.

all <- all %>% filter(end_date == "2021-08-23") %>%
  select(-end_date) # 145 obs of 4 var

# Tag rows so NOAA stations can be told apart from USGS sites after rbind.
all$type <- "NOAA"

rm(airport_links, airport_list)

Part 3

Once we had all active weather stations in IA, IL and IN, we paired that data to our three USGS monitoring locations to find their neighbors, or closest corresponding station, using their coordinates.

# join usgs_simple and all
# Stack the 3 USGS sites and 145 NOAA stations into one table; both share
# the columns name, lat, long, begin_date, type. Coordinates come in as
# character from the scrape, so coerce to numeric for mapping/distances.
locations <- rbind(usgs_simple, all)
locations$lat <- as.double(locations$lat)
locations$long <- as.double(locations$long)

Here is a map that plots all these:

# map these out
# Color-code markers by source: blue = NOAA stations, orange = USGS sites.
station_color <- colorFactor(c("#139bf0", "#e69f07"), domain=c("NOAA", "USGS"))

# Plot every station/site as a circle; popup shows the station name.
leaflet(locations) %>%
  addProviderTiles(providers$Stamen.Toner) %>%
  addCircles(~long, ~lat, popup=locations$name, weight = 3, radius=7000, 
             color=~station_color(type), stroke = F, fillOpacity = 0.8)

For some it is easy to visually see which weather station is closest to each USGS site, but just to be sure, we used the following bit to calculate each neighbor.

# Convert to a spatial object so rgeos can compute pairwise distances.
sp.locations <- locations
coordinates(sp.locations) <- ~long+lat

# Pairwise distance matrix between every point.
# NOTE(review): gDistance() on unprojected lon/lat coordinates returns
# Euclidean distances in degrees, not meters — adequate for picking a
# nearest neighbor at this scale, but not a true ground distance.
d <- gDistance(sp.locations, byid=T)
# For each row, the index of the nearest OTHER point (order position 1
# is the point itself at distance 0, so take position 2).
min.d <- apply(d, 1, function(x) order(x, decreasing=F)[2])

# Bind each location to its nearest neighbor's row and that distance.
neighbors <- cbind(locations, locations[min.d,], apply(d, 1, function(x) sort(x, decreasing=F)[2]))

colnames(neighbors) <- c(colnames(locations),"neighbor", "lat2", "long2", "begin_date2", "type2", "apply")

# Keep only the three USGS rows (each paired with its closest NOAA
# station) and drop the raw distance column.
neighbors <- filter(neighbors, type == "USGS") %>%
  select(-apply)

rm(d, sp.locations, min.d, usgs_simple)

Part 4

After requesting precipitation data from NOAA’s LCD website for the last 5 years (as far back as nitrate monitoring goes at our 3 locations), we also requested nitrate data for the same time frame within R Studio.

# request USGS nitrate data from 2015-03-20 to 2021-08-18
# (the earlier comment said 2021-08-01; the code actually pulls to 08-18)
usgs_data <- readNWISdata(siteNumbers = usgs$site_no, parameterCd = "99133",
                   startDate = "2015-03-20", endDate = "2021-08-18",
                   service = "iv")

# clean columns
# X_99133_00000 is the instantaneous nitrate value for parameter 99133.
usgs_data <- usgs_data %>% select(site_no,
                                  datetime = dateTime,
                                  nitrate_lv = X_99133_00000)

# Calendar date of each reading, used for daily aggregation below.
usgs_data$date <- as.Date(usgs_data$datetime) # 602,152 obs of 4 var

# separate data by each location 
usgs_IL <- usgs_data %>% filter(site_no == "03336890") # 172,535 obs of 4 var
usgs_IA <- usgs_data %>% filter(site_no == "05482300") # 218,930 obs of 4 var
usgs_IN <- usgs_data %>% filter(site_no == "05524500") # 210,687 obs of 4 var

rm(usgs_data)

Nitrate data is very granular, as the readings are taken every 15 minutes. So, we aggregated those to the maximum daily levels, daily_peak, to get an idea how high they can go at a given day after a precipitation event.

# aggregate to peak nitrate levels per day 
# Readings arrive every 15 minutes; collapse to the daily maximum.
# NOTE: max(..., na.rm = TRUE) yields -Inf (with a warning) for any day
# whose readings are all NA.

usgs_IL_peak <- usgs_IL %>% group_by(date) %>%
  summarise(daily_peak = max(nitrate_lv, na.rm = TRUE)) # 1826 obs of 2 var

usgs_IA_peak <- usgs_IA %>% group_by(date) %>%
  summarise(daily_peak = max(nitrate_lv, na.rm = TRUE)) # 2331 obs of 2 var

usgs_IN_peak <- usgs_IN %>% group_by(date) %>%
  summarise(daily_peak = max(nitrate_lv, na.rm = TRUE)) # 2304 obs of 2 var

rm(usgs_IA, usgs_IL, usgs_IN)

NOAA’s data is hourly precipitation, so we aggregated this data as total daily precipitation for each station. This aggregation is done through another short function we made.

# now import the NOAA data I requested on their LCD website
# One hourly Local Climatological Data export per airport station —
# the nearest NOAA station to each of the three USGS sites.

storm_lake_IA <- fread("Data/NOAA/storm_lake.csv") # 166,905 obs of 124 var
rantoul_IL <- fread("Data/NOAA/rantoul.csv") # 167,763 obs of 124 var
jasper_IN <- fread("Data/NOAA/jasper.csv") # 166,952 obs of 124 

# Collapse a raw NOAA LCD export into total daily precipitation.
#
# airport: an LCD data frame with at least DATE (timestamp) and
#          HourlyPrecipitation columns.
# Returns one row per calendar date with total_precip, the sum of the
# hourly readings; missing hourly values are treated as zero rain.
precipitation_function <- function(airport){

  airport %>%
    # keep just the two fields we need, with shorter names
    select(datetime = DATE,
           precip_hour = HourlyPrecipitation) %>%
    # calendar date of each hourly reading
    mutate(date = date(datetime)) %>%
    # make NA into 0s
    mutate(precip_hour = replace_na(precip_hour, 0)) %>%
    # aggregate to total daily precip
    group_by(date) %>%
    summarise(total_precip = sum(precip_hour))
}

# now run the function to each state
# Row counts now roughly match the daily nitrate-peak tables above
# (one row per day in the 2015-2021 window).

storm_lake_IA <- precipitation_function(storm_lake_IA) # 2331 obs of 2 var
rantoul_IL <- precipitation_function(rantoul_IL) # 2343 obs of 2 var
jasper_IN <- precipitation_function(jasper_IN) # 2335 obs of 2 var

Now that both our nitrate data and precipitation data are in the same format, we made interactive graphics with the dygraphs package.

# Create the xts (format different than DF) necessary to use dygraph
# merge(..., join = 'left') keeps only dates present in the nitrate
# series; days with no matching precipitation row become NA. The merged
# column names ("nitrates_*", "precipitation_*") are taken from the
# variable names and are referenced later by dySeries().
nitrates_IA <- xts(x = usgs_IA_peak$daily_peak, order.by = usgs_IA_peak$date)
precipitation_IA <- xts(x = storm_lake_IA$total_precip, order.by = storm_lake_IA$date)
IA_xts <- merge(nitrates_IA, precipitation_IA, join = 'left')
rm(nitrates_IA, precipitation_IA, usgs_IA_peak, storm_lake_IA)

nitrates_IL <- xts(x = usgs_IL_peak$daily_peak, order.by = usgs_IL_peak$date)
precipitation_IL <- xts(x = rantoul_IL$total_precip, order.by = rantoul_IL$date)
IL_xts <- merge(nitrates_IL, precipitation_IL, join = 'left')
rm(nitrates_IL, precipitation_IL, usgs_IL_peak, rantoul_IL)

nitrates_IN <- xts(x = usgs_IN_peak$daily_peak, order.by = usgs_IN_peak$date)
precipitation_IN <- xts(x = jasper_IN$total_precip, order.by = jasper_IN$date)
IN_xts <- merge(nitrates_IN, precipitation_IN, join = 'left')
rm(nitrates_IN, precipitation_IN, usgs_IN_peak, jasper_IN)

# plot
# Build one dual-axis dygraph: daily peak nitrates (left axis) against
# total daily precipitation (right axis), with the zone above the
# 10 mg/L federal threshold shaded.
#
# xts_data: merged xts from above, whose precipitation column is named
#           paste0("precipitation_", state_cd).
# state_cd: two-letter state code used in the title and series lookup.
nitrate_dygraph <- function(xts_data, state_cd) {
  dygraph(xts_data,
          main = paste0("Nitrates and precipitation in ", state_cd,
                        " location, 2015-2021")) %>%
    dyAxis("y", label = "Daily Peak Nitrates (mg/L)") %>%
    dyAxis("y2", label = "Total Daily Precipitation (in)",
           independentTicks = TRUE) %>%
    dySeries(paste0("precipitation_", state_cd), axis = 'y2') %>%
    dyRangeSelector(height = 20) %>%
    dyHighlight(highlightCircleSize = 5) %>%
    # shade above the 10 mg/L federal threshold
    dyShading(from = 10, axis = "y", to = 50, color = "#FFE6E6") %>%
    dyOptions(colors = c("#a97de3", "#6fc978"), drawGrid = FALSE) %>%
    dyLegend(labelsSeparateLines = TRUE)
}

IA <- nitrate_dygraph(IA_xts, "IA")

IL <- nitrate_dygraph(IL_xts, "IL")

IN <- nitrate_dygraph(IN_xts, "IN")

Viz

IA
IL
IN

We understand one of the limitations from this analysis are the different watershed characteristics. As reference, here are the drainage basins from each of the three locations using the USGS application, StreamStats.

Iowa:

Illinois:

Indiana: